gvc_agora_opentargets

Setup environment

library(tidyverse)
library(janitor)
library(broom)
library(readxl)
library(jsonlite)

library(gprofiler2)

# Use the black-and-white ggplot2 theme for all plots in this document.
theme_set(new = theme_bw())

# Fix the RNG seed so the random row shuffles (sample_frac) further down are
# reproducible from one render to the next.
set.seed(seed = 666)

Read and prep data

GVC genes (within 1Mb flanking regions of GVC loci) [OLD]

# Load the GVC 1 Mb comparison workbook and normalise its column names.
gvc <- clean_names(read_xlsx("GVC_1Mb_comparison_050224.xlsx"))

# Split gene_id on its non-alphanumeric separator; the second piece is named
# `version` (presumably an Ensembl version suffix -- TODO confirm) and is
# dropped right away together with two annotation columns not used here.
gvc <- separate(gvc, gene_id, into = c("gene_id", "version"))
gvc <- select(gvc, -version, -agora_nominated_list, -opentarget_info)

gvc
# Collapse to one row per gene: sorting by absolute_distance first means that
# distinct() keeps, for each gene_id, the row with the smallest distance.
gvc.genes <- arrange(gvc, absolute_distance)
gvc.genes <- distinct(gvc.genes, gene_id, .keep_all = TRUE)
gvc.genes <- select(gvc.genes, gene_id, gene_symbol, absolute_distance)
gvc.genes <- arrange(gvc.genes, gene_symbol)

gvc.genes
# Number of distinct gene symbols in the collapsed list.
n_distinct(gvc.genes$gene_symbol)
[1] 1344

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes

Remove genes in APOE and HLA loci and manually add APOE and HLA genes (based on Bellenguez2022):

# The four genes that are added back by hand after the locus-level exclusion
# below (the text above attributes this choice to Bellenguez2022).
gvc.genes.apoe_hla <- filter(
  gvc.genes,
  gene_id %in% c(
    "ENSG00000130203",
    "ENSG00000196735",
    "ENSG00000179344",
    "ENSG00000196126"
  )
)

gvc.genes.apoe_hla

# Drop every row assigned to the APOE / TOMM40 or HLA grouped loci.
gvc.minus_apoe_hla <- filter(
  gvc,
  grouped_loci_gvc != "APOE / TOMM40",
  grouped_loci_gvc != "HLA"
)

gvc.minus_apoe_hla

# Same per-gene collapse as for gvc.genes (closest row per gene_id), then
# append the hand-picked APOE/HLA genes and sort by symbol.
gvc.genes.minus_apoe_hla <- arrange(gvc.minus_apoe_hla, absolute_distance)
gvc.genes.minus_apoe_hla <- distinct(gvc.genes.minus_apoe_hla, gene_id, .keep_all = TRUE)
gvc.genes.minus_apoe_hla <- select(gvc.genes.minus_apoe_hla, gene_id, gene_symbol, absolute_distance)
gvc.genes.minus_apoe_hla <- bind_rows(gvc.genes.minus_apoe_hla, gvc.genes.apoe_hla)
gvc.genes.minus_apoe_hla <- arrange(gvc.genes.minus_apoe_hla, gene_symbol)

gvc.genes.minus_apoe_hla

Agora genes

Alzheimer’s disease gene prioritization scores from Agora (see also related journal article):

# Agora overall scores: parse the JSON export (simplified to vectors/data
# frames where possible) and convert it to a tibble.
ago1 <- as_tibble(
  read_json(
    "agora.syn25741025.overall_scores.v12.2024-10-24.json",
    simplifyVector = TRUE
  )
)

ago1

Alzheimer’s disease genes (Agora nominated targets):

https://agora.adknowledgeportal.org/genes/nominated-targets

# Agora nominated-target list (one row per nominated gene).
ago2 <- read_csv("agora.nominated-targets.gene-list.2024-10-24.csv")
ago2

# Keep only the scored genes whose HGNC symbol is on the nominated list.
ago <- filter(ago1, hgnc_symbol %in% ago2[["Gene Symbol"]])

ago

OpenTargets genes

Alzheimer’s disease gene prioritization scores from OpenTargets:

# OpenTargets associated-targets export; the literal string "No data" is read
# as NA.
# NOTE(review): the Methods text at the end of this document cites data
# version v24_09 retrieved on October 24th 2024, but the active line reads
# the v24_03 export dated 6_4_2024 -- confirm which file is intended.
ot <- read_tsv(
  file = "OT-MONDO_0004975-associated-targets-6_4_2024-v24_03.tsv",
  na = "No data",
  show_col_types = FALSE
)

# ot <- read_tsv("OT-MONDO_0004975-associated-targets-10_24_2024-v24_09.tsv", show_col_types = FALSE, na = "No data")

ot

Add Ensembl gene IDs by mapping the OpenTargets gene symbols with g:Convert:

# Map OpenTargets gene symbols to Ensembl gene IDs with g:Convert and attach
# the original OpenTargets columns to every mapping.
otcols <- colnames(ot)
otensg <- gconvert(
  query = ot$symbol,
  organism = "hsapiens",
  target = "ENSG",
  mthreshold = Inf,   # keep every Ensembl ID a symbol maps to
  filter_na = TRUE    # drop symbols without any mapping
) %>%
  # input_number is matched against the row number of `ot`, which
  # rownames_to_column() yields as a character column.
  mutate(input_number = as.character(input_number)) %>%
  left_join(ot %>% rownames_to_column(var = "input_number"), by = "input_number") %>%
  # all_of() makes explicit that `otcols` is an external character vector of
  # column names; passing it bare is deprecated in tidyselect and would be
  # ambiguous if a column called `otcols` ever existed.
  select(ensembl_gene_id = target, all_of(otcols))

otensg

Overlaps between GVC, Agora, and OpenTargets genes

library(VennDiagram)

# The three gene sets (Ensembl gene IDs) compared in the Venn diagram.
x <- list(
  GVC = gvc.genes$gene_id,
  Agora = ago$ensembl_gene_id,
  OpenTargets = otensg$ensembl_gene_id
)

# filename = NULL makes venn.diagram() return the grid objects instead of
# writing an image file, so they can be drawn on a fresh page here.
grid.newpage()
v <- venn.diagram(
  x,
  filename = NULL,
  fill = c("#FF0000", "#00FF00", "#0000FF")
)
grid.draw(v)

# One row per Venn region, with the member gene IDs in `..values..`.
p <- get.venn.partitions(x)
p

ORA of genes in overlaps

GVC ∩ Agora ∩ OpenTargets

# Genes shared by all three sets, annotated with Agora and OpenTargets scores.
genes <- p %>%
  filter(..set.. == "GVC∩Agora∩OpenTargets") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (randomizes the starting order that arrange()
  # works from; depends on the seed set at the top).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)
# Keep the triple-overlap gene IDs for later use.
overlap_gene_ids <- query

GVC ∩ Agora

# GVC ∩ Agora = triple overlap plus the GVC/Agora-only region.
genes <- p %>%
  filter(..set.. %in% c("GVC∩Agora∩OpenTargets", "(GVC∩Agora)∖(OpenTargets)")) %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC ∩ OpenTargets

# GVC ∩ OpenTargets = triple overlap plus the GVC/OpenTargets-only region.
genes <- p %>%
  filter(..set.. %in% c("GVC∩Agora∩OpenTargets", "(GVC∩OpenTargets)∖(Agora)")) %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Agora ∩ OpenTargets

# Agora ∩ OpenTargets = triple overlap plus the Agora/OpenTargets-only region.
genes <- p %>%
  filter(..set.. %in% c("GVC∩Agora∩OpenTargets", "(Agora∩OpenTargets)∖(GVC)")) %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(GVC ∩ Agora) ∪ (GVC ∩ OpenTargets) ∪ (Agora ∩ OpenTargets)

# Genes found in at least two of the three sets: the triple overlap plus the
# three pairwise-only regions.
genes <- p %>%
  filter(
    ..set.. %in% c(
      "GVC∩Agora∩OpenTargets",
      "(GVC∩Agora)∖(OpenTargets)",
      "(GVC∩OpenTargets)∖(Agora)",
      "(Agora∩OpenTargets)∖(GVC)"
    )
  ) %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(Agora ∩ OpenTargets) ∖ (GVC)

# Genes in both Agora and OpenTargets but not in GVC.
genes <- p %>%
  filter(..set.. == "(Agora∩OpenTargets)∖(GVC)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(GVC ∩ OpenTargets) ∖ (Agora)

# Genes in both GVC and OpenTargets but not in Agora.
genes <- p %>%
  filter(..set.. == "(GVC∩OpenTargets)∖(Agora)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(OpenTargets) ∖ (GVC ∪ Agora)

# Genes found only in OpenTargets.
genes <- p %>%
  filter(..set.. == "(OpenTargets)∖(GVC∪Agora)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(GVC ∩ Agora) ∖ (OpenTargets)

# Genes in both GVC and Agora but not in OpenTargets.
genes <- p %>%
  filter(..set.. == "(GVC∩Agora)∖(OpenTargets)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(Agora) ∖ (GVC ∪ OpenTargets)

# Genes found only in Agora.
genes <- p %>%
  filter(..set.. == "(Agora)∖(GVC∪OpenTargets)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

(GVC) ∖ (Agora ∪ OpenTargets)

# Genes found only in GVC.
genes <- p %>%
  filter(..set.. == "(GVC)∖(Agora∪OpenTargets)") %>%
  unnest(cols = ..values..) %>%
  select(gene_id = ..values..) %>%
  left_join(ago, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  select(
    gene_id, symbol, genetics_score, otGeneticsPortal,
    globalScore, target_risk_score, multi_omics_score
  ) %>%
  arrange(
    desc(genetics_score),
    desc(otGeneticsPortal),
    desc(target_risk_score),
    desc(globalScore)
  )

genes

# Ranked, de-duplicated Ensembl IDs sent to g:Profiler.
query <- unique(genes$gene_id)

# Ordered ORA: electronic GO annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

relocate(gostres$result, term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

ORA of GVC genes

GVC genes (within 1Mb flanking regions of GVC loci)

Important

unordered query

# All GVC genes (1 Mb window), identifiers only.
d0 <- select(gvc.genes, gene_id, gene_symbol)

d0
query <- unique(d0$gene_id)

# Unordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = FALSE, # <- UNORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

# GVC genes (1 Mb window) without the APOE/HLA loci, identifiers only.
d0.minus_apoe_hla <- select(gvc.genes.minus_apoe_hla, gene_id, gene_symbol)

d0.minus_apoe_hla
query <- unique(d0.minus_apoe_hla$gene_id)

# Unordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = FALSE, # <- UNORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 200Kb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

# Restrict to genes whose absolute distance is at most 200,000.
d0.minus_apoe_hla.200kb <- select(
  filter(gvc.genes.minus_apoe_hla, absolute_distance <= 200000),
  gene_id, gene_symbol
)

d0.minus_apoe_hla.200kb
query <- unique(d0.minus_apoe_hla.200kb$gene_id)

# Unordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = FALSE, # <- UNORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 20Kb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

# Restrict to genes whose absolute distance is at most 20,000.
d0.minus_apoe_hla.20kb <- select(
  filter(gvc.genes.minus_apoe_hla, absolute_distance <= 20000),
  gene_id, gene_symbol
)

d0.minus_apoe_hla.20kb
query <- unique(d0.minus_apoe_hla.20kb$gene_id)

# Unordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = FALSE, # <- UNORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes, ordered by absolute distance from GVC loci

Important

query ordered by absolute distance

# Same gene list as the unordered analysis, now ranked closest-first.
# Note: this overwrites the d0.minus_apoe_hla object created earlier.
d0.minus_apoe_hla <- select(
  arrange(gvc.genes.minus_apoe_hla, absolute_distance),
  gene_id, gene_symbol
)

d0.minus_apoe_hla
query <- unique(d0.minus_apoe_hla$gene_id)

# Save the ranked IDs and start the multi-query text: a "> name" header line
# followed by that query's gene IDs.
writeLines(text = query, con = "enrichmentmap/gvc.genes.minus_apoe_hla.absolute_distance.query.txt")

multiquery <- c("> gvc.genes.minus_apoe_hla.absolute_distance", query)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE, # <- ORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Perform alternative ORA of GO:CP using GOrilla <2024-10-24>:

# Show the saved GOrilla results with the term description as first column.
relocate(read_tsv("gvc.genes.minus_apoe_hla.gorilla.tsv"), Description)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes sorted by Agora’s genetics_score

Note

ago1 is used instead of ago to annotate a larger proportion of GVC genes with Agora scores

# Annotate GVC genes with Agora (all scored genes) and OpenTargets scores,
# then rank by Agora genetics_score, highest first.
d1 <- gvc.genes.minus_apoe_hla %>%
  left_join(ago1, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  arrange(desc(genetics_score)) %>%
  select(-symbol, -hgnc_symbol) %>%
  relocate(gene_id, gene_symbol, genetics_score, otGeneticsPortal)

d1
query <- unique(d1$gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes sorted by OpenTargets’ otGeneticsPortal

Note

ago1 is used instead of ago to annotate a larger proportion of GVC genes with Agora scores

# Annotate GVC genes with Agora (all scored genes) and OpenTargets scores,
# then rank by OpenTargets otGeneticsPortal, highest first.
d2 <- gvc.genes.minus_apoe_hla %>%
  left_join(ago1, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  arrange(desc(otGeneticsPortal)) %>%
  select(-symbol, -hgnc_symbol) %>%
  relocate(gene_id, gene_symbol, otGeneticsPortal, genetics_score)

d2
query <- unique(d2$gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes sorted by Agora’s target_risk_score

Note

ago1 is used instead of ago to annotate a larger proportion of GVC genes with Agora scores

# Annotate GVC genes with Agora (all scored genes) and OpenTargets scores,
# then rank by Agora target_risk_score, highest first.
d3 <- gvc.genes.minus_apoe_hla %>%
  left_join(ago1, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  arrange(desc(target_risk_score)) %>%
  select(-symbol, -hgnc_symbol) %>%
  relocate(gene_id, gene_symbol, target_risk_score, globalScore)

d3
query <- unique(d3$gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes sorted by OpenTargets’ globalScore

Note

ago1 is used instead of ago to annotate a larger proportion of GVC genes with Agora scores

# Annotate GVC genes with Agora (all scored genes) and OpenTargets scores,
# then rank by OpenTargets globalScore, highest first.
d4 <- gvc.genes.minus_apoe_hla %>%
  left_join(ago1, by = c("gene_id" = "ensembl_gene_id")) %>%
  left_join(otensg, by = c("gene_id" = "ensembl_gene_id")) %>%
  # Shuffle rows before sorting (seed-dependent starting order for arrange()).
  sample_frac(1L) %>%
  arrange(desc(globalScore)) %>%
  select(-symbol, -hgnc_symbol) %>%
  relocate(gene_id, gene_symbol, globalScore, target_risk_score)

d4
query <- unique(d4$gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

ORA of Agora genes

Agora genes sorted by genetics_score

# Agora nominated genes that have a genetics_score, ranked highest first.
d5 <- drop_na(ago, genetics_score)
d5 <- sample_frac(d5, 1L) # shuffle rows before sorting
d5 <- arrange(d5, desc(genetics_score))

d5
query <- unique(d5$ensembl_gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Agora genes sorted by multi_omics_score

# Agora nominated genes that have a multi_omics_score, ranked highest first.
d6 <- drop_na(ago, multi_omics_score)
d6 <- sample_frac(d6, 1L) # shuffle rows before sorting
d6 <- arrange(d6, desc(multi_omics_score))

d6
query <- unique(d6$ensembl_gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Agora genes sorted by target_risk_score

# Agora nominated genes that have a target_risk_score, ranked highest first.
d7 <- drop_na(ago, target_risk_score)
d7 <- sample_frac(d7, 1L) # shuffle rows before sorting
d7 <- arrange(d7, desc(target_risk_score))

d7
query <- unique(d7$ensembl_gene_id)

# Save the ranked IDs and append them, under their own "> name" header, to
# the multi-query text built up so far.
writeLines(text = query, con = "enrichmentmap/agora.genes.target_risk_score.query.txt")

multiquery <- c(multiquery, "> agora.genes.target_risk_score", query)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

ORA of OpenTargets genes

OpenTargets genes sorted by otGeneticsPortal

# OpenTargets genes that have an otGeneticsPortal score, ranked highest first.
d8 <- drop_na(otensg, otGeneticsPortal)
d8 <- sample_frac(d8, 1L) # shuffle rows before sorting
d8 <- arrange(d8, desc(otGeneticsPortal))

d8
query <- unique(d8$ensembl_gene_id)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

OpenTargets genes sorted by globalScore

# OpenTargets genes that have a globalScore, ranked highest first.
d9 <- drop_na(otensg, globalScore)
d9 <- sample_frac(d9, 1L) # shuffle rows before sorting
d9 <- arrange(d9, desc(globalScore))

d9
query <- unique(d9$ensembl_gene_id)

# Save the ranked IDs and append them, under their own "> name" header, to
# the multi-query text built up so far.
writeLines(text = query, con = "enrichmentmap/opentargets.genes.global_score.query.txt")

multiquery <- c(multiquery, "> opentargets.genes.global_score", query)

# Ordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = TRUE,
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Write multiquery to file for later use in enrichmentmap

# Persist the accumulated "> name" headers and gene IDs, one entry per line.
writeLines(text = multiquery, con = "enrichmentmap/multiquery.txt")

ORA of GVC genes in table from manuscript

# Gene list from the manuscript table (sheet "PG Gene List"; the first
# spreadsheet row is skipped).
d10 <- read_xlsx(
  path = "2024-08-29_GVC Table 1C - WORKING COPY.xlsx",
  sheet = "PG Gene List",
  skip = 1
)

# Take the gene column, add APOE by hand, and de-duplicate.
query <- d10 %>%
  rename(gene = `GVC expanded list of possible genes (500kb)`) %>%
  bind_rows(tibble(gene = "APOE")) %>%
  pull(gene) %>%
  unique()

# Unordered ORA restricted to GO:BP, KEGG and Reactome; electronic GO
# annotations excluded, g:SCS-corrected p < 0.005.
gostres <- gost(
  query = query,
  organism = "hsapiens",
  ordered_query = FALSE, # <- UNORDERED QUERY!
  sources = c("GO:BP", "KEGG", "REAC"),
  exclude_iea = TRUE,
  domain_scope = "annotated",
  significant = TRUE,
  correction_method = "gSCS",
  user_threshold = 0.005
)

# Show only terms of 5-350 genes that share at least 3 genes with the query.
gostres$result %>%
  filter(between(term_size, 5, 350), intersection_size >= 3) %>%
  relocate(term_name, term_id, source)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Check missing OpenTargets scores in table from manuscript

# Manuscript table: gene symbol plus the two OpenTargets scores recorded
# there ("No data" is read as NA).
t <- read_xlsx(
  "8-23-2024 - GVC Table 1C - WORKING COPYL_MRC.xlsx",
  skip = 1,
  na = "No data"
) %>%
  janitor::clean_names() %>%
  select(
    symbol = gvc_expanded_list_of_possible_genes_500kb,
    open_target_scores_global,
    open_target_scores_genetics
  )

# Rows where the manuscript score disagrees with the OpenTargets export.
# A plain `a != b` evaluates to NA when either side is missing and filter()
# silently drops NA rows, which would hide exactly the missing scores this
# check is meant to surface. A row is therefore reported when
#   * exactly one of the two values is missing, or
#   * both are present and differ after rounding to 4 decimals.
t %>%
  left_join(ot, by = "symbol") %>%
  filter(
    xor(is.na(open_target_scores_global), is.na(globalScore)) |
      (!is.na(open_target_scores_global) & !is.na(globalScore) &
        round(open_target_scores_global, 4) != round(globalScore, 4))
  ) %>%
  select(symbol, open_target_scores_global, globalScore)
t %>%
  left_join(ot, by = "symbol") %>%
  filter(
    xor(is.na(open_target_scores_genetics), is.na(otGeneticsPortal)) |
      (!is.na(open_target_scores_genetics) & !is.na(otGeneticsPortal) &
        round(open_target_scores_genetics, 4) != round(otGeneticsPortal, 4))
  ) %>%
  select(symbol, open_target_scores_genetics, otGeneticsPortal)

GVC loci annotated with genes in overlaps

GVC ∩ Agora ∩ OpenTargets

# Distinct Ensembl IDs in the triple overlap. The Agora/OpenTargets joins
# that used to sit here were removed: a left join never adds or drops
# gene_id values, and distinct(gene_id) discarded every joined column anyway.
gene_ids <- p %>%
  filter(..set.. == "GVC∩Agora∩OpenTargets") %>%
  unnest(cols = ..values..) %>%
  distinct(gene_id = ..values..) %>%
  pull(gene_id)

length(gene_ids)
[1] 75
# One row per GVC locus listing its triple-overlap genes, rendered with gt.
gvc %>%
  filter(gene_id %in% gene_ids) %>%
  # Sorting by symbol first fixes the order in which genes are pasted below.
  arrange(gene_symbol) %>%
  # unite(gene, gene_id, gene_symbol, sep = ":", remove = FALSE) %>%
  distinct(gvc_locus = grouped_loci_gvc, gene = gene_symbol) %>%
  group_by(gvc_locus) %>%
  summarize(genes = str_c(gene, collapse = " | ")) %>%
  gt::gt()
gvc_locus genes
ABCA7 ABCA7 | NDUFS7
ABI3 / ACE NGFR | ZNF652
ACE ACE
ADAM10 / MINDY2 ADAM10 | ALDH1A2 | LIPC
ADAMTS4 ADAMTS4 | FCER1G | NDUFS2
ANK3 / CCDC6 CCDC6 | SLC16A9
ANKRD31 ANKRD31 | ENC1
APH1B LACTB
APOE / TOMM40 APOC1 | APOE | BCAM | MARK4 | NECTIN2
APP MRPL39
APP / ADAMTS1 ADAMTS1
BCKDK / KAT8 / VKORC1 BCKDK | STX4 | VKORC1
BIN1 BIN1
CASS4 CASS4
CD2AP CD2AP
CD33 CD33
CHRNE ENO3 | RABEP1 | SLC25A11 | ZFP3
CLU / PTK2B CLU | EPHX2 | PTK2B | SCARA3
CR1 CR1
CTSH CTSH
DOC2A DOC2A
ECHDC3 / USP6NL USP6NL
EED / PICALM DLG2 | PICALM
EPHA1 / EPHA1-AS1 EPHA1
HAVCR2 CYFIP2 | HAVCR2
HLA HLA-DRA | HLA-DRB1
ICA1 NXPH1
IDUA CPLX1
IL34 MTSS2
INPP5D INPP5D
LILRB2 / TMC4 LAIR1
MADD / SPI1 C1QTNF4 | NDUFS3 | NR1H3 | RAPSN | SPI1
MS4A / MS4A2 / MS4A4A / MS4A6A MRPL16 | MS4A2 | MS4A4A | MS4A6A
NDUFAF7 / PRKD3 QPCT
NYAP1 / PILRA / SPDYE3 / ZCWPW1 NYAP1
OARD1 / TREM2 / TREML2 / UNC5CL TREM2
PLCG2 PLCG2 | SDR42E1
PLEKHA1 HTRA1
RABEP1 / SCIMP ENO3 | RABEP1 | SLC25A11 | ZFP3
RASGEF1C MAPK9
RIN3 / SLC24A4 RIN3 | SLC24A4
SHARPIN PLEC
SIGLEC11 NR1H2
WNT3 NSF

Table of GVC genes sorted by combined rank (GVC absolute distance, Agora and OpenTargets scores)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes sorted by GVC’s absolute_distance, Agora’s target_risk_score and OpenTargets’ globalScore.

Genes are sorted based on a weighted average rank with the following weights:

  • 0.50 for GVC’s absolute_distance
  • 0.25 for Agora’s target_risk_score
  • 0.25 for OpenTargets’ globalScore
Note

ago1 is used instead of ago to annotate a larger proportion of GVC genes with Agora scores

# Weights of the three component ranks in the combined rank.
weights <- c(gvc = 0.50, agora = 0.25, opentargets = 0.25)
stopifnot(length(weights) == 3)

d <- gvc.genes.minus_apoe_hla %>%
  left_join(ago1, by = join_by(gene_id == ensembl_gene_id)) %>%
  left_join(otensg, by = join_by(gene_id == ensembl_gene_id)) %>%
  # Randomize row order before ranking: rank() puts NA scores last and gives
  # each NA its own rank in row order, so the shuffle decides their order.
  sample_frac(1L) %>%
  mutate(
    gvc_rank = rank(absolute_distance, ties.method = "min"),      # closest = 1
    agora_rank = rank(-target_risk_score, ties.method = "min"),   # highest = 1
    opentargets_rank = rank(-globalScore, ties.method = "min")    # highest = 1
  ) %>%
  # Weighted sum with each weight applied to its own rank column.
  # (Multiplying a data frame by a length-3 vector recycles the vector down
  # the rows, not across the columns, so `select(...) * weights` mixed the
  # weights up between genes instead of between ranks.)
  mutate(
    combined_rank =
      weights[[1]] * gvc_rank +
      weights[[2]] * agora_rank +
      weights[[3]] * opentargets_rank
  ) %>%
  mutate(combined_rank = rank(combined_rank, ties.method = "min")) %>%
  arrange(combined_rank) %>%
  select(gene_id,
         gene_symbol,
         combined_rank,
         gvc_rank, agora_rank,
         opentargets_rank,
         absolute_distance,
         target_risk_score,
         genetics_score,
         multi_omics_score,
         globalScore,
         otGeneticsPortal)

d

Download the table above as an Excel spreadsheet here.

# Symbols listed in gvc_1361.txt that are absent from gvc.genes.
# `%notin%` is not provided by base R or by any package loaded in this
# document, so the negated `%in%` is used instead.
read_tsv("gvc_1361.txt") %>%
  filter(!(GVC %in% gvc.genes$gene_symbol)) %>%
  arrange(GVC)

Correlation between GVC, Agora and OpenTargets scores

GVC vs Agora

# Kendall rank correlation between proximity (negated distance, so that
# larger means closer) and each Agora score, on complete pairs only.
d %>%
  drop_na(absolute_distance, target_risk_score) %>%
  summarize(tidy(cor.test(-absolute_distance, target_risk_score, method = "kendall")))
d %>%
  drop_na(absolute_distance, genetics_score) %>%
  summarize(tidy(cor.test(-absolute_distance, genetics_score, method = "kendall")))
d %>%
  drop_na(absolute_distance, multi_omics_score) %>%
  summarize(tidy(cor.test(-absolute_distance, multi_omics_score, method = "kendall")))

GVC vs OpenTargets

# Kendall rank correlation between proximity (negated distance, so that
# larger means closer) and each OpenTargets score, on complete pairs only.
d %>%
  drop_na(absolute_distance, globalScore) %>%
  summarize(tidy(cor.test(-absolute_distance, globalScore, method = "kendall")))
d %>%
  drop_na(absolute_distance, otGeneticsPortal) %>%
  summarize(tidy(cor.test(-absolute_distance, otGeneticsPortal, method = "kendall")))

Agora vs OpenTargets

# Kendall rank correlation between Agora's target_risk_score and
# OpenTargets' globalScore, on genes that have both.
d %>%
  drop_na(target_risk_score, globalScore) %>%
  summarize(tidy(cor.test(target_risk_score, globalScore, method = "kendall")))

Methods for manuscript

Phase 2. Pathway analysis of GVC, Agora and OpenTargets candidate AD genes. We conducted gene set over-representation analysis (ORA) of GVC, Agora and OpenTargets candidate AD gene lists using R[37] with the gprofiler2 package (see https://github.com/marcoralab/gvc_agora_opentargets), excluding electronic Gene Ontology (GO) annotations, and filtering results using a p-value significance threshold of 0.005 after multiple testing correction with the g:SCS algorithm.

We used the GVC gene list of 1,344 genes in the proximity of AD risk loci that we built as described above. We retrieved Agora’s gene list of Alzheimer’s disease nominated targets (site version 3.4.0; data version syn13363290-v68) from https://agora.adknowledgeportal.org/genes/nominated-targets and Agora’s gene scores (data version syn25741025-v12) from https://www.synapse.org/Synapse:syn25741025 on October 24th, 2024. We retrieved OpenTargets’ gene list of Alzheimer’s disease (EFO:MONDO_0004975) associated targets (data version v24_09) from https://platform.opentargets.org/disease/MONDO_0004975/associations on October 24th, 2024.

We conducted ORA using the following candidate AD gene lists: 1) Agora’s gene list of Alzheimer’s disease nominated targets decreasingly ordered by Agora’s genetics, multi-omics, or target risk score; 2) OpenTargets’ gene list of Alzheimer’s disease associated targets decreasingly ordered by OpenTargets’ genetics portal or global score; 3) GVC gene list decreasingly ordered by the aforementioned Agora’s or OpenTargets’ scores; 4) lists corresponding to regions of the Venn diagram built using R[37] with the VennDiagram package (see https://github.com/marcoralab/gvc_agora_opentargets) and three sets corresponding to gene lists 1-3, decreasingly ordered by the aforementioned Agora’s or OpenTargets’ scores.